
SISMID Spatial Statistics

Data Lab 3 Working with Areal Data


3.1 Working with Spatial Polygons

We will first load a standard data frame consisting of several county-level variables:

library(here)
library(sf)
library(maps)
library(stringr)
library(colorRamps)
library(ggplot2)
library(spdep)
library(DCluster)

load ( here("data", "Data.RData") )
summary (dat)
##      FIPS                        Area     State      Population    
##  Length:226         Baldwin County :  2   AL: 67   Min.   :  1884  
##  Class :character   Bibb County    :  2   GA:159   1st Qu.: 13321  
##  Mode  :character   Calhoun County :  2            Median : 24790  
##                     Cherokee County:  2            Mean   : 62711  
##                     Clarke County  :  2            3rd Qu.: 61806  
##                     Clay County    :  2            Max.   :992137  
##                     (Other)        :214                            
##      Cases            Income           inc        
##  Min.   :   6.0   Min.   :16646   Min.   :  5.00  
##  1st Qu.:  48.0   1st Qu.:27477   1st Qu.: 25.00  
##  Median :  91.0   Median :31186   Median : 41.00  
##  Mean   : 299.3   Mean   :33255   Mean   : 45.88  
##  3rd Qu.: 223.8   3rd Qu.:36738   3rd Qu.: 61.00  
##  Max.   :6139.0   Max.   :71227   Max.   :131.00  
## 

Next, we will read in county spatial polygons for the contiguous US in the maps package. R can also read in any shapefile via the sf package. The st_as_sf function converts the map object to an sf object, which encodes the ID, polygons, and any unit-specific data value.

# Read county polygons for Alabama and Georgia from the maps package.
# fill = TRUE is required so the result can be converted to polygons;
# plot = FALSE suppresses the default base-graphics plot.
# (Use TRUE/FALSE rather than T/F, which are reassignable.)
county.map = map ("county", region = c("alabama", "georgia"), fill = TRUE, plot = FALSE)
county.map = sf::st_as_sf(county.map) #Convert from map object to sf

plot (st_geometry(county.map)) #only plot the polygons

Our sf object contains several attributes:

str (county.map)
## Classes 'sf' and 'data.frame':   226 obs. of  2 variables:
##  $ ID  : chr  "alabama,autauga" "alabama,baldwin" "alabama,barbour" "alabama,bibb" ...
##  $ geom:sfc_MULTIPOLYGON of length 226; first list element: List of 1
##   ..$ :List of 1
##   .. ..$ : num [1:51, 1:2] -86.5 -86.5 -86.5 -86.6 -86.6 ...
##   ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
##  - attr(*, "sf_column")= chr "geom"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA
##   ..- attr(*, "names")= chr "ID"
attributes (county.map)
## $names
## [1] "ID"   "geom"
## 
## $row.names
##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
##  [19]  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
##  [37]  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
##  [55]  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
##  [73]  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
##  [91]  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
## [109] 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## [127] 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## [145] 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## [163] 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## [181] 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## [199] 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## [217] 217 218 219 220 221 222 223 224 225 226
## 
## $sf_column
## [1] "geom"
## 
## $agr
##   ID 
## <NA> 
## Levels: constant aggregate identity
## 
## $class
## [1] "sf"         "data.frame"

Let’s look at some metadata information:

print (county.map, n = 2)
## Simple feature collection with 226 features and 1 field
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -88.47614 ymin: 30.24071 xmax: -80.84435 ymax: 35.01345
## Geodetic CRS:  WGS 84
## First 2 features:
##                ID                           geom
## 1 alabama,autauga MULTIPOLYGON (((-86.50517 3...
## 2 alabama,baldwin MULTIPOLYGON (((-87.93757 3...

There are several ways we can extract information from an sf object.

#Extract the first polygon and make a plot
poly1 = st_geometry (county.map)[[1]]

#Print all the points used to draw the polygon
poly1

plot (poly1)

Our first data task is to merge the dataset with the sf object. In our case, the counties are ordered by row exactly the same for the two datasets and we can use cbind. In general, it’s good to merge by polygon ID.

#Option 1
dat.areal = cbind (county.map, dat)
dat.areal[1:3,]
## Simple feature collection with 3 features and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -88.01778 ymin: 30.24071 xmax: -85.06131 ymax: 32.71016
## Geodetic CRS:  WGS 84
##                ID  FIPS           Area State Population Cases Income inc
## 1 alabama,autauga 01001 Autauga County    AL      49960   184  42013  37
## 2 alabama,baldwin 01003 Baldwin County    AL     171769   416  40250  24
## 3 alabama,barbour 01005 Barbour County    AL      27941   165  25101  59
##                             geom
## 1 MULTIPOLYGON (((-86.50517 3...
## 2 MULTIPOLYGON (((-87.93757 3...
## 3 MULTIPOLYGON (((-85.42801 3...
#Option 2

data (county.fips) #Load county 5-digit FIPS code
county.map$FIPS = county.fips$fips[ match (county.map$ID, county.fips$polyname)]
county.map$FIPS =  str_pad (county.map$FIPS, 5, "left", "0")
all (county.map$FIPS %in% dat$FIPS) 
## [1] TRUE
dat.areal2 = merge (county.map, dat, by = "FIPS")

The object dat.areal now contains both the polygon information and the data frame we would like to use. We can do standard operations on the data frame now. We can also create a choropleth map!

dim (dat.areal)
## [1] 226   9
names(dat.areal)
## [1] "ID"         "FIPS"       "Area"       "State"      "Population"
## [6] "Cases"      "Income"     "inc"        "geom"
plot (dat.areal["Income"], main = "Median Household Income in 2000")

plot (dat.areal["inc"],  main = "Chlamydia Incidence (per 10,000) in 2007")

Or use ggplot.

ggplot() + geom_sf (data = dat.areal, aes (fill = Income))+
  labs(title="Median Household Income in 2000")   +scale_fill_gradient2(low = "white",high = "red", limits = c(0, 80000))

ggplot() +  geom_sf (data = dat.areal, aes (fill = inc))+
  labs(title="Chlamydia Incidence (per 10,000)")  +scale_fill_gradient2(low = "white",high = "blue", limits = c(0, 140))

3.2 Defining Spatial Proximity

The package spdep () contains a suite of functions for working with areal spatial data. The poly2nb () function identifies neighbours using spatial polygons. Neighbours are defined as sharing a common boundary point. The default in poly2nb uses the queen’s case definition; that is, two polygons are neighbours even if they share only a single boundary point. This can be suppressed by the queen = option.

#Because our shapefile here is un-projected, distance operations can be difficult. Alternatively, we can project the data to, for example, the Lambert Conformal Conic North America.
dat.areal_proj = st_transform(dat.areal, crs = "ESRI:102004")
plot (st_geometry(dat.areal_proj))

nb = poly2nb (dat.areal_proj)
nb2 = poly2nb (dat.areal_proj, queen=FALSE)

#A lot of good summaries about the neighbor structure
summary (nb)
## Neighbour list object:
## Number of regions: 226 
## Number of nonzero links: 1258 
## Percentage nonzero weights: 2.462996 
## Average number of links: 5.566372 
## Link number distribution:
## 
##  2  3  4  5  6  7  8  9 10 
##  3 12 37 50 70 41  8  3  2 
## 3 least connected regions:
## 90 92 186 with 2 links
## 2 most connected regions:
## 120 127 with 10 links

The function nb2mat () then creates a proximity/weight matrix using the neighbourhood information. Here we have several options to define weights:

W = nb2mat (nb, style = "B") 
center_proj = st_centroid(dat.areal_proj) ##Extract centroids of polygons
center = st_transform(center_proj, crs = 4326) #Re-project back to WGS 84
center = st_coordinates (center)                 

plot (st_geometry(dat.areal))
plot (nb, center, add = TRUE, col = "blue")
plot (nb2, center, add = TRUE, col = "red", lwd = 1)
title (main="Blue lines = additional neighbours by queen's case")

## Using kth-nearest distance
nb.k1 = knn2nb (knearneigh(center_proj,k=1))
nb.k2 = knn2nb (knearneigh(center_proj,k=2), row.names=row.names(center))

plot (st_geometry(dat.areal))
plot (nb.k2, center, add = TRUE, col = "red", lwd = 1)
plot (nb.k1, center, add = TRUE, col = "blue", lwd = 1)
title (main="Blue lines = additional second nearest-neighbour")

## Using buffer distance
nb.buffer1 = dnearneigh(center_proj, d1=0, d2 = 25*1000)
nb.buffer2 = dnearneigh(center_proj, d1=0, d2 = 40*1000)

plot (st_geometry(dat.areal))
plot (nb.buffer2, center, add = TRUE, col = "red", lwd = 1)
plot (nb.buffer1, center, add = TRUE, col = "blue", lwd =2)
legend ("topright", legend =c("<25 km", "< 40 km"), col= c("blue", "red"), pch = 16)

3.3 Spatial Lag Regression

Let’s first produce a Moran Plot. We first show how to do it by hand. The moran.plot () function in spdep will also identify high influence points.

Y = dat.areal$Income

nb = poly2nb (dat.areal_proj)
W = nb2mat (nb, style = "W") 

# Spatially lagged values: each entry of WY is the (row-standardized)
# weighted average of the neighbours' incomes.
WY = W%*%Y
# Fixed typo in the axis label: "Wegithed" -> "Weighted".
plot (WY~Y, col = 4, ylab = "Spatially Lagged Weighted Income", xlab = "Income", cex.lab = 1.4, cex.axis = 1.2, cex = 1.2)
# Dashed reference lines at the means partition the plot into the four
# Moran-scatter quadrants; the 45-degree line is a visual aid.
abline (h = mean (WY), lty = 2); abline(v=mean(Y), lty=2)
abline(0,1)

#Here we need to use the weighted adjacency matrix (default in *nb2listw*)
moran.plot (Y, nb2listw(nb))

Calculate Moran’s I by hand and perform asymptotic hypothesis test.

ybar = mean (Y)
r = Y - ybar
I = sum(r%*% t(r)*W)/ sum (r^2)*nrow(W)/sum(W)
I
## [1] 0.5409861
#Note that the Moran's statistic is the same as the slope from the previous regression model
coef (lm (WY~Y))
##  (Intercept)            Y 
## 1.517555e+04 5.409861e-01
#The default performs non-Gaussian test under randomization
col.W <- nb2listw(nb, style="W")
moran.test (Y, col.W)
## 
##  Moran I test under randomisation
## 
## data:  Y  
## weights: col.W    
## 
## Moran I statistic standard deviate = 13.634, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.540986055      -0.004444444       0.001600329
#Assume Y is Gaussian
moran.test (Y, col.W, randomisation = FALSE)
## 
##  Moran I test under normality
## 
## data:  Y  
## weights: col.W    
## 
## Moran I statistic standard deviate = 13.547, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.540986055      -0.004444444       0.001620976

Moran’s I hypothesis test by permutation

I.perm = moran.mc (Y, col.W, 10000)
I.perm
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  Y 
## weights: col.W  
## number of simulations + 1: 10001 
## 
## statistic = 0.54099, observed rank = 10001, p-value = 9.999e-05
## alternative hypothesis: greater
# Null distribution of Moran's I from the permutations.
hist (I.perm$res, main = "Moran's I under Null Hypothesis", xlab = "", ylab ="")
# Mark the observed statistic. Previously hard-coded as 0.567, which did not
# match the observed Moran's I of 0.54099 reported by moran.mc above; use the
# statistic stored in the moran.mc result instead.
abline (v = I.perm$statistic, col = 2, lwd = 4)
text (0.5, 20000, "Observed \n Moran's I", col = 2)

Geary’s C

##Geary's C
geary.test (Y, col.W)
## 
##  Geary C test under randomisation
## 
## data:  Y 
## weights: col.W 
## 
## Geary C statistic standard deviate = 12.987, p-value < 2.2e-16
## alternative hypothesis: Expectation greater than statistic
## sample estimates:
## Geary C statistic       Expectation          Variance 
##       0.425651698       1.000000000       0.001955928
# Permutation (Monte-Carlo) test for Geary's C with 10,000 simulations.
C.perm = geary.mc (Y, col.W, 10000)
# Null distribution of Geary's C from the permutations.
hist (C.perm$res, main = "Geary's C under Null Hypothesis", xlab = "", ylab ="")
# Mark the observed statistic. Previously hard-coded as 0.4295, which did not
# match the observed Geary's C of 0.425651698 reported above; use the
# statistic stored in the geary.mc result instead.
abline (v = C.perm$statistic, col = 2, lwd = 4)
text (0.5, 15000, "Observed \n Geary's C", col = 2)

# 3.4 Local Moran’s I

To estimate local Moran’s I for the chlamydia incidence rate, we need a first-order adjacency matrix W. The localmoran () function gives:

Y = dat.areal$inc
col.W <- nb2listw(nb, style="B")

I.local = localmoran (Y, col.W)
I.local[1:4,]
##          Ii         E.Ii    Var.Ii       Z.Ii Pr(z != E(Ii))
## 1 -2.340030 -0.002422875 0.5375311 -3.1883775    0.001430736
## 2 -1.103451 -0.011766813 2.6159660 -0.6749646    0.499698269
## 3  2.445618 -0.007403101 1.6265637  1.9233832    0.054431941
## 4 -4.071777 -0.011786781 2.5992363 -2.5182691    0.011793319

Next, we extract the local Moran’s I results and include them in our spatial dataframe.

dat.areal$I.local = I.local[,1]
I.local.p = I.local[,5] 
I.local.p_bonf = p.adjust (I.local[,5], method ="bonferroni")
I.local.p_holm = p.adjust (I.local[,5], method ="holm")
I.local.p_fdr = p.adjust (I.local[,5], method ="fdr")

plot.dat = data.frame (FIPS = dat.areal$FIPS,
                  type =rep(c("Raw", "Bonferroni", "Holm", "FDR"), each = length (I.local.p)),
                  p = c(I.local.p, I.local.p_bonf, I.local.p_holm, I.local.p_fdr))
plot.dat$p=as.factor(cut(plot.dat$p, c(0,0.01, 0.05, 1), right=TRUE ))
levels (plot.dat$p) = c("<0.01", "0.01-0.05", ">0.05")
plot.dat$type = factor(plot.dat$type, levels = c("Raw", "Bonferroni", "Holm", "FDR"))

plot.dat = merge (dat.areal, plot.dat, by = "FIPS")
ggplot () + geom_sf (data = plot.dat, aes (fill = p)) + facet_wrap(~type)+
  scale_fill_discrete(name = "p-value")

To avoid relying on asymptotic normality, we can also perform a permutation test. Here we re-sample the incidence data with replacement 10,000 times. Each time, we calculate the local Moran’s I statistics. First, note that under H_0 of no spatial dependence, the local Moran’s I statistics exhibit large skewness compared to a normal distribution.

#Number of permutation
n.iter = 10000 

#Row = 226 counties, column = permutation
I.keep = matrix (NA, ncol = n.iter, nrow = length (Y))
Y = dat.areal$inc

# Build the null distribution: for each iteration, re-sample the incidence
# values with replacement (breaking any spatial structure) and store the
# resulting local Moran's I statistics (first column of localmoran output).
# seq_len() is safe for n.iter = 0; TRUE is preferred over the reassignable T.
for (i in seq_len(n.iter)){ 
  if (i %% 1000 == 0){print (i)}  # progress every 1000 iterations
  I.keep[,i] <- localmoran (sample (Y, length(Y), replace = TRUE), col.W)[,1]
}
## [1] 1000
## [1] 2000
## [1] 3000
## [1] 4000
## [1] 5000
## [1] 6000
## [1] 7000
## [1] 8000
## [1] 9000
## [1] 10000
#Check the normality for local Moran's I for county 1 and county 20
par (mfrow = c(1,2))
qqnorm(I.keep[1,], main = "Moran's I (Null Distribution)\n County ID = 1");qqline(I.keep[1,])
qqnorm(I.keep[20,], main = "Moran's I (Null Distribution)\n County ID = 20");qqline(I.keep[20,])

I.obs = localmoran (Y, col.W)[,1]

#Calculate P(Local Moran's I > observed | Null)  
P_perm = apply ( sweep (I.keep, 1, I.obs, ">" ), 1, mean) 
P_perm[P_perm==0] = 1/n.iter

dat.areal$MoranI_perm_p= cut(p.adjust(P_perm, "bonferroni"), c(0,0.01, 0.05, 1) )
levels (dat.areal$MoranI_perm_p) = c("<0.01", "0.01-0.05", ">0.05")

ggplot () + geom_sf (data = dat.areal, aes (fill = MoranI_perm_p)) + 
  ggtitle("Permutation-based Bonferroni-corrected") + scale_fill_discrete(name = "p-value")

3.5 Global Model-Based Cluster Detection

We will now focus on performing cluster detection specifically for case incidence data. We will use functions in the R package DCluster. These functions often require the input as a dataset with two variable names: Observed and Expected. First, we perform tests to see if spatial clustering exists.

dismap = data.frame (Observed = dat.areal$Cases, Pop = dat.areal$Population)
theta_0 = sum(dismap$Observed)/sum(dismap$Pop)
dismap$Expected = dismap$Pop * theta_0

################
#   Chi2 test  #
################
#Asymptotic test
achisq.stat (dismap)
## $T
## [1] 21755.29
## 
## $df
## [1] 225
## 
## $pvalue
## [1] 0
#Simulation-based
achisq.test(Observed~offset(log(Expected)), data = dismap, model = "poisson", R=1000)
## Chi-square test for overdispersion 
## 
##  Type of boots.: parametric 
##  Model used when sampling: Poisson 
##  Number of simulations: 1000 
##  Statistic:  21755.29 
##  p-value :  0.000999001
achisq.test(Observed~offset(log(Expected)), data = dismap, model = "multinom", R=1000)
## Chi-square test for overdispersion 
## 
##  Type of boots.: parametric 
##  Model used when sampling: Multinomial 
##  Number of simulations: 1000 
##  Statistic:  21755.29 
##  p-value :  0.000999001
achisq.test(Observed~offset(log(Expected)), data = dismap, model = "negbin", R=1000)
## Chi-square test for overdispersion 
## 
##  Type of boots.: parametric 
##  Model used when sampling: Negative Binomial 
##  Number of simulations: 1000 
##  Statistic:  21755.29 
##  p-value :  0.3976024
##############################
# Potthoff-Whittinghill test #
##############################
pottwhitt.test(Observed~offset(log(Expected)), data = dismap, model = "poisson", R=1000)
## Potthoff-Whittinghill's test of overdispersion 
## 
##  Type of boots.: parametric 
##  Model used when sampling: Poisson 
##  Number of simulations: 1000 
##  Statistic:  6032784494 
##  p-value :  0.000999001
pottwhitt.test(Observed~offset(log(Expected)), data = dismap, model = "multinom", R=1000)
## Potthoff-Whittinghill's test of overdispersion 
## 
##  Type of boots.: parametric 
##  Model used when sampling: Multinomial 
##  Number of simulations: 1000 
##  Statistic:  6032784494 
##  p-value :  0.000999001
###############
#  Moran's I  #
###############
nb = poly2nb (dat.areal_proj)
col.W = nb2listw(nb)
moranI.test (Observed~offset(log(Expected)), data = dismap, model = "poisson", R=1000, listw = col.W, n = nrow(dismap), S0 = Szero(col.W))
## Moran's I test of spatial autocorrelation 
## 
##  Type of boots.: parametric 
##  Model used when sampling: Poisson 
##  Number of simulations: 1000 
##  Statistic:  0.3525243 
##  p-value :  0.000999001